# Load required libraries
library(dplyr)

# Report the current working directory (file paths below are relative to it)
getwd()
## [1] "/Users/kodeboyina/Documents/Kent State/Sem2/BA/Group Project"
# Load the House_Prices CSV into R.
# header = TRUE and sep = "," are read.csv() defaults, so only the
# non-default stringsAsFactors flag is spelled out.
House_Prices <- read.csv("data/House_Prices.csv", stringsAsFactors = TRUE)

Data Overview - Descriptive Analysis

Data overview: a summary of the data set, including the number of observations and variables, and the data types and ranges for each variable.

# Preview the first ten rows of the data set
head(House_Prices, 10)
##    LotArea OverallQual YearBuilt YearRemodAdd BsmtFinSF1 FullBath HalfBath
## 1     8450           7      2003         2003        706        2        1
## 2     9600           6      1976         1976        978        2        0
## 3    11250           7      2001         2002        486        2        1
## 4     9550           7      1915         1970        216        1        0
## 5    14260           8      2000         2000        655        2        1
## 6    14115           5      1993         1995        732        1        1
## 7    10084           8      2004         2005       1369        2        0
## 8    10382           7      1973         1973        859        2        1
## 9     6120           7      1931         1950          0        2        0
## 10    7420           5      1939         1950        851        1        0
##    BedroomAbvGr TotRmsAbvGrd Fireplaces GarageArea YrSold SalePrice
## 1             3            8          0        548   2008    208500
## 2             3            6          1        460   2007    181500
## 3             3            6          1        608   2008    223500
## 4             3            7          1        642   2006    140000
## 5             4            9          1        836   2008    250000
## 6             1            5          0        480   2009    143000
## 7             3            7          1        636   2007    307000
## 8             3            7          2        484   2009    200000
## 9             2            8          2        468   2008    129900
## 10            2            5          2        205   2008    118000
# Shape of the data set: rows x columns
dim(House_Prices)
## [1] 900  13
# Structure: each column's name, type, and a preview of its values
str(House_Prices)
## 'data.frame':    900 obs. of  13 variables:
##  $ LotArea     : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ OverallQual : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ YearBuilt   : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd: int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ BsmtFinSF1  : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ FullBath    : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath    : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr: int  3 3 3 3 4 1 3 3 2 2 ...
##  $ TotRmsAbvGrd: int  8 6 6 7 9 5 7 7 8 5 ...
##  $ Fireplaces  : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ GarageArea  : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ YrSold      : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ SalePrice   : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
# Five-number summary plus mean for every column of House_Prices
summary(House_Prices)
##     LotArea        OverallQual      YearBuilt     YearRemodAdd    BsmtFinSF1  
##  Min.   :  1491   Min.   : 1.00   Min.   :1880   Min.   :1950   Min.   :   0  
##  1st Qu.:  7585   1st Qu.: 5.00   1st Qu.:1954   1st Qu.:1968   1st Qu.:   0  
##  Median :  9442   Median : 6.00   Median :1973   Median :1994   Median : 384  
##  Mean   : 10795   Mean   : 6.14   Mean   :1971   Mean   :1985   Mean   : 446  
##  3rd Qu.: 11618   3rd Qu.: 7.00   3rd Qu.:2000   3rd Qu.:2004   3rd Qu.: 729  
##  Max.   :215245   Max.   :10.00   Max.   :2010   Max.   :2010   Max.   :2260  
##     FullBath       HalfBath      BedroomAbvGr   TotRmsAbvGrd     Fireplaces   
##  Min.   :0.00   Min.   :0.000   Min.   :0.00   Min.   : 2.00   Min.   :0.000  
##  1st Qu.:1.00   1st Qu.:0.000   1st Qu.:2.00   1st Qu.: 5.00   1st Qu.:0.000  
##  Median :2.00   Median :0.000   Median :3.00   Median : 6.00   Median :1.000  
##  Mean   :1.56   Mean   :0.386   Mean   :2.84   Mean   : 6.48   Mean   :0.628  
##  3rd Qu.:2.00   3rd Qu.:1.000   3rd Qu.:3.00   3rd Qu.: 7.00   3rd Qu.:1.000  
##  Max.   :3.00   Max.   :2.000   Max.   :8.00   Max.   :14.00   Max.   :3.000  
##    GarageArea       YrSold       SalePrice     
##  Min.   :   0   Min.   :2006   Min.   : 34900  
##  1st Qu.: 336   1st Qu.:2007   1st Qu.:130000  
##  Median : 480   Median :2008   Median :163000  
##  Mean   : 473   Mean   :2008   Mean   :183108  
##  3rd Qu.: 576   3rd Qu.:2009   3rd Qu.:216878  
##  Max.   :1390   Max.   :2010   Max.   :755000
# skimr::skim() gives a richer overview: missingness, quantiles, and an
# inline histogram for every numeric column
library(skimr)
skim(House_Prices)
Data summary
Name House_Prices
Number of rows 900
Number of columns 13
_______________________
Column type frequency:
numeric 13
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
LotArea 0 1 10794.60 11942.21 1491 7585 9442 11618.2 215245 ▇▁▁▁▁
OverallQual 0 1 6.14 1.38 1 5 6 7.0 10 ▁▁▇▅▁
YearBuilt 0 1 1971.45 30.01 1880 1954 1973 2000.0 2010 ▁▂▃▆▇
YearRemodAdd 0 1 1985.33 20.34 1950 1968 1994 2004.0 2010 ▅▂▂▃▇
BsmtFinSF1 0 1 446.53 446.52 0 0 384 728.8 2260 ▇▅▂▁▁
FullBath 0 1 1.56 0.56 0 1 2 2.0 3 ▁▇▁▇▁
HalfBath 0 1 0.39 0.50 0 0 0 1.0 2 ▇▁▅▁▁
BedroomAbvGr 0 1 2.84 0.82 0 2 3 3.0 8 ▁▇▁▁▁
TotRmsAbvGrd 0 1 6.48 1.61 2 5 6 7.0 14 ▂▇▇▁▁
Fireplaces 0 1 0.63 0.66 0 0 1 1.0 3 ▇▇▁▂▁
GarageArea 0 1 472.61 208.85 0 336 480 576.0 1390 ▂▇▃▁▁
YrSold 0 1 2007.84 1.32 2006 2007 2008 2009.0 2010 ▇▇▇▇▅
SalePrice 0 1 183107.92 81908.18 34900 130000 163000 216877.8 755000 ▇▅▁▁▁
# Load the DataExplorer library for quick EDA helpers
library(DataExplorer)
## Plot basic description for House_Prices data
## View basic description for House_Prices data
# introduce() reports row/column counts, discrete vs continuous columns,
# missing values, complete rows, and memory usage in a single row
introduce(House_Prices)
##   rows columns discrete_columns continuous_columns all_missing_columns
## 1  900      13                0                 13                   0
##   total_missing_values complete_rows total_observations memory_usage
## 1                    0           900              11700        50408
# Count missing values (NA) per column of House_Prices.
# Fix: use `<-` for assignment, per R convention (`=` was used before).
missing_counts <- colSums(is.na(House_Prices))

print(missing_counts)
##      LotArea  OverallQual    YearBuilt YearRemodAdd   BsmtFinSF1     FullBath 
##            0            0            0            0            0            0 
##     HalfBath BedroomAbvGr TotRmsAbvGrd   Fireplaces   GarageArea       YrSold 
##            0            0            0            0            0            0 
##    SalePrice 
##            0
# Plot the share of missing observations and complete rows
plot_intro(House_Prices)

# Bar plot of the per-column NA counts computed above (all zero here)
barplot(missing_counts, main = "Null Values", xlab = "Variables", ylab = "Count")

# Report each column's class.
# Fix: vapply() is preferred over sapply() in scripts -- it guarantees a
# character vector with one entry per column, so the result type cannot
# silently change (sapply() may return a list for multi-class columns).
variable_types <- vapply(House_Prices, class, character(1))
print(variable_types)
##      LotArea  OverallQual    YearBuilt YearRemodAdd   BsmtFinSF1     FullBath 
##    "integer"    "integer"    "integer"    "integer"    "integer"    "integer" 
##     HalfBath BedroomAbvGr TotRmsAbvGrd   Fireplaces   GarageArea       YrSold 
##    "integer"    "integer"    "integer"    "integer"    "integer"    "integer" 
##    SalePrice 
##    "integer"

Based on the above observation, all the variables are integers; there are no categorical variables in our data.

# Descriptive plots for the numeric predictors and the response
numeric_vars <- c(
  "LotArea", "OverallQual", "BsmtFinSF1", "FullBath", "HalfBath",
  "BedroomAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageArea", "SalePrice"
)

# Arrange the plots in a 2 x 3 grid
par(mfrow = c(2, 3))

# One box plot per numeric variable (outlier inspection)
for (nm in numeric_vars) {
  boxplot(House_Prices[[nm]], main = paste("Box Plot : ", nm))
}

# Reset the 2 x 3 grid, then one histogram per variable (distribution shape)
par(mfrow = c(2, 3))
for (nm in numeric_vars) {
  hist(House_Prices[[nm]], main = paste("Histogram Plot : ", nm), xlab = nm)
}

library(corrplot)

# Pairwise Pearson correlations across all 13 numeric columns
correlation_matrix <- cor(House_Prices)

# Diverging palette: blue (negative) -> white (zero) -> red (positive)
color_scheme <- colorRampPalette(c("blue", "white", "red"))(20)



# Plot a heatmap of the correlation matrix with custom color and title
corrplot(correlation_matrix,
         method = "color",            # Use color to represent correlation values
         col = color_scheme,          # Specify custom color scheme
         title = "House Prices Correlation Heatmap",  # Specify custom title
         tl.cex = 0.8,                # Adjust text size for column and row names
         mar = c(2, 2, 1, 1)          # Adjust margins (bottom, top, left, right)
)

# Print the numeric correlation matrix itself
print(correlation_matrix)
##                LotArea OverallQual YearBuilt YearRemodAdd BsmtFinSF1 FullBath
## LotArea       1.000000     0.09621  0.007639     0.012302  0.2070352  0.12855
## OverallQual   0.096209     1.00000  0.569225     0.547469  0.2273585  0.55071
## YearBuilt     0.007639     0.56922  1.000000     0.569604  0.2645981  0.46267
## YearRemodAdd  0.012302     0.54747  0.569604     1.000000  0.1322066  0.43500
## BsmtFinSF1    0.207035     0.22736  0.264598     0.132207  1.0000000  0.05284
## FullBath      0.128547     0.55071  0.462667     0.434997  0.0528409  1.00000
## HalfBath     -0.002609     0.30429  0.275349     0.205962 -0.0030281  0.12918
## BedroomAbvGr  0.089578     0.11259 -0.046072     0.004014 -0.1160040  0.36402
## TotRmsAbvGrd  0.153195     0.45870  0.128530     0.238986  0.0592867  0.56632
## Fireplaces    0.265592     0.39349  0.164903     0.122247  0.2929777  0.22522
## GarageArea    0.152720     0.59817  0.496031     0.379742  0.2869558  0.41051
## YrSold       -0.021080    -0.04878  0.008918     0.036270 -0.0007844 -0.02034
## SalePrice     0.264372     0.79621  0.526634     0.522177  0.4046632  0.55801
##               HalfBath BedroomAbvGr TotRmsAbvGrd Fireplaces GarageArea
## LotArea      -0.002609     0.089578      0.15320    0.26559    0.15272
## OverallQual   0.304286     0.112591      0.45870    0.39349    0.59817
## YearBuilt     0.275349    -0.046072      0.12853    0.16490    0.49603
## YearRemodAdd  0.205962     0.004014      0.23899    0.12225    0.37974
## BsmtFinSF1   -0.003028    -0.116004      0.05929    0.29298    0.28696
## FullBath      0.129185     0.364024      0.56632    0.22522    0.41051
## HalfBath      1.000000     0.203046      0.33171    0.21738    0.21842
## BedroomAbvGr  0.203046     1.000000      0.67145    0.07540    0.08123
## TotRmsAbvGrd  0.331714     0.671454      1.00000    0.31038    0.36196
## Fireplaces    0.217375     0.075402      0.31038    1.00000    0.26626
## GarageArea    0.218421     0.081228      0.36196    0.26626    1.00000
## YrSold       -0.023044    -0.028930     -0.06891   -0.06196   -0.04385
## SalePrice     0.304740     0.164427      0.57736    0.46863    0.65604
##                  YrSold SalePrice
## LotArea      -0.0210802   0.26437
## OverallQual  -0.0487804   0.79621
## YearBuilt     0.0089179   0.52663
## YearRemodAdd  0.0362696   0.52218
## BsmtFinSF1   -0.0007844   0.40466
## FullBath     -0.0203373   0.55801
## HalfBath     -0.0230436   0.30474
## BedroomAbvGr -0.0289300   0.16443
## TotRmsAbvGrd -0.0689141   0.57736
## Fireplaces   -0.0619571   0.46863
## GarageArea   -0.0438451   0.65604
## YrSold        1.0000000  -0.04627
## SalePrice    -0.0462718   1.00000
# Diverging palette: blue (negative) -> white (zero) -> red (positive)
color_scheme <- colorRampPalette(c("blue", "white", "red"))(20)

# Upper-triangle heat map with the correlation coefficients printed in each
# cell. Fix: the custom palette computed above was never passed to
# corrplot(), so the plot silently fell back to the default colors;
# `col = color_scheme` now applies it as the comment intended.
corrplot(correlation_matrix,
         method = "color",            # Use color to represent correlation values
         col = color_scheme,          # Apply the custom palette (was missing)
         type = "upper",              # Display only the upper triangle of the matrix
         tl.col = "black",            # Color of text for column and row names
         tl.srt = 45,                 # Rotation angle of text
         tl.cex = 0.8,                # Text size for column and row names
         tl.offset = 1,               # Offset of text from the heat map
         addCoef.col = "black",       # Color of correlation coefficients
         number.cex = 0.7,            # Text size for correlation coefficients
         number.digits = 2,           # Number of digits for correlation coefficients
         diag = FALSE,                # Exclude diagonal elements
         outline = TRUE               # Display outline around each cell
)

The correlation matrix describes the relationships between the variables in the data set. Values range from -1 to 1: +1 indicates a perfect positive correlation, -1 a perfect negative correlation, and 0 no correlation. The matrix also offers insight into how each feature relates to the response variable, SalePrice. Notably, OverallQual has a strong positive correlation of 0.796 with SalePrice, indicating a significant influence on a home's sale value. GarageArea (0.656) and TotRmsAbvGrd (0.577) also show notable positive correlations, suggesting they affect the sale price as well. These high-correlation relationships highlight the importance of these features in predicting home sale prices, emphasizing their relevance for effective analysis and model construction.

library(MASS)

# Full linear model with all 12 predictor variables.
# Fix: the original spelled every term as `House_Prices$X` inside the
# formula instead of using the `data =` argument. A model fit that way is
# hard-wired to the training vectors, so a later
# predict(model, newdata = ...) silently ignores the new data -- the likely
# cause of the nonsense test metrics (negative R-squared) reported further
# down in this analysis. `SalePrice ~ .` regresses the response on every
# other column, i.e. exactly the same 12-predictor model.
model <- lm(SalePrice ~ ., data = House_Prices)

summary(model)
## 
## Call:
## lm(formula = House_Prices$SalePrice ~ House_Prices$LotArea + 
##     House_Prices$OverallQual + House_Prices$YearBuilt + House_Prices$YearRemodAdd + 
##     House_Prices$BsmtFinSF1 + House_Prices$FullBath + House_Prices$HalfBath + 
##     House_Prices$BedroomAbvGr + House_Prices$TotRmsAbvGrd + House_Prices$Fireplaces + 
##     House_Prices$GarageArea + House_Prices$YrSold)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -286336  -20369   -2819   16607  349565 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -1.38e+06   1.85e+06   -0.75    0.456    
## House_Prices$LotArea       7.11e-01   1.08e-01    6.59  7.7e-11 ***
## House_Prices$OverallQual   2.30e+04   1.42e+03   16.21  < 2e-16 ***
## House_Prices$YearBuilt     1.29e+02   6.08e+01    2.13    0.034 *  
## House_Prices$YearRemodAdd  3.86e+02   7.84e+01    4.92  1.0e-06 ***
## House_Prices$BsmtFinSF1    3.10e+01   3.07e+00   10.10  < 2e-16 ***
## House_Prices$FullBath      5.88e+03   3.24e+03    1.82    0.069 .  
## House_Prices$HalfBath      3.05e+03   2.79e+03    1.09    0.274    
## House_Prices$BedroomAbvGr -1.14e+04   2.16e+03   -5.26  1.8e-07 ***
## House_Prices$TotRmsAbvGrd  1.59e+04   1.34e+03   11.84  < 2e-16 ***
## House_Prices$Fireplaces    9.58e+03   2.17e+03    4.42  1.1e-05 ***
## House_Prices$GarageArea    6.11e+01   7.72e+00    7.91  7.6e-15 ***
## House_Prices$YrSold        1.30e+02   9.22e+02    0.14    0.887    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36300 on 887 degrees of freedom
## Multiple R-squared:  0.807,  Adjusted R-squared:  0.804 
## F-statistic:  308 on 12 and 887 DF,  p-value: <2e-16
# MASS::stepAIC() performs AIC-based model selection, allowing terms to be both added and dropped
step_model <- stepAIC(model, direction = "both") # Stepwise regression with both forward and backward 
## Start:  AIC=18910
## House_Prices$SalePrice ~ House_Prices$LotArea + House_Prices$OverallQual + 
##     House_Prices$YearBuilt + House_Prices$YearRemodAdd + House_Prices$BsmtFinSF1 + 
##     House_Prices$FullBath + House_Prices$HalfBath + House_Prices$BedroomAbvGr + 
##     House_Prices$TotRmsAbvGrd + House_Prices$Fireplaces + House_Prices$GarageArea + 
##     House_Prices$YrSold
## 
##                             Df Sum of Sq      RSS   AIC
## - House_Prices$YrSold        1  2.64e+07 1.17e+12 18908
## - House_Prices$HalfBath      1  1.57e+09 1.17e+12 18910
## <none>                                   1.17e+12 18910
## - House_Prices$FullBath      1  4.35e+09 1.17e+12 18912
## - House_Prices$YearBuilt     1  5.95e+09 1.17e+12 18913
## - House_Prices$Fireplaces    1  2.56e+10 1.19e+12 18928
## - House_Prices$YearRemodAdd  1  3.18e+10 1.20e+12 18933
## - House_Prices$BedroomAbvGr  1  3.64e+10 1.20e+12 18936
## - House_Prices$LotArea       1  5.70e+10 1.22e+12 18951
## - House_Prices$GarageArea    1  8.23e+10 1.25e+12 18970
## - House_Prices$BsmtFinSF1    1  1.34e+11 1.30e+12 19006
## - House_Prices$TotRmsAbvGrd  1  1.85e+11 1.35e+12 19041
## - House_Prices$OverallQual   1  3.46e+11 1.51e+12 19142
## 
## Step:  AIC=18908
## House_Prices$SalePrice ~ House_Prices$LotArea + House_Prices$OverallQual + 
##     House_Prices$YearBuilt + House_Prices$YearRemodAdd + House_Prices$BsmtFinSF1 + 
##     House_Prices$FullBath + House_Prices$HalfBath + House_Prices$BedroomAbvGr + 
##     House_Prices$TotRmsAbvGrd + House_Prices$Fireplaces + House_Prices$GarageArea
## 
##                             Df Sum of Sq      RSS   AIC
## - House_Prices$HalfBath      1  1.58e+09 1.17e+12 18908
## <none>                                   1.17e+12 18908
## - House_Prices$FullBath      1  4.36e+09 1.17e+12 18910
## + House_Prices$YrSold        1  2.64e+07 1.17e+12 18910
## - House_Prices$YearBuilt     1  5.96e+09 1.17e+12 18911
## - House_Prices$Fireplaces    1  2.56e+10 1.19e+12 18926
## - House_Prices$YearRemodAdd  1  3.21e+10 1.20e+12 18931
## - House_Prices$BedroomAbvGr  1  3.64e+10 1.20e+12 18934
## - House_Prices$LotArea       1  5.70e+10 1.22e+12 18949
## - House_Prices$GarageArea    1  8.23e+10 1.25e+12 18968
## - House_Prices$BsmtFinSF1    1  1.34e+11 1.30e+12 19005
## - House_Prices$TotRmsAbvGrd  1  1.85e+11 1.35e+12 19039
## - House_Prices$OverallQual   1  3.46e+11 1.51e+12 19140
## 
## Step:  AIC=18908
## House_Prices$SalePrice ~ House_Prices$LotArea + House_Prices$OverallQual + 
##     House_Prices$YearBuilt + House_Prices$YearRemodAdd + House_Prices$BsmtFinSF1 + 
##     House_Prices$FullBath + House_Prices$BedroomAbvGr + House_Prices$TotRmsAbvGrd + 
##     House_Prices$Fireplaces + House_Prices$GarageArea
## 
##                             Df Sum of Sq      RSS   AIC
## <none>                                   1.17e+12 18908
## - House_Prices$FullBath      1  3.34e+09 1.17e+12 18908
## + House_Prices$HalfBath      1  1.58e+09 1.17e+12 18908
## + House_Prices$YrSold        1  2.86e+07 1.17e+12 18910
## - House_Prices$YearBuilt     1  8.14e+09 1.18e+12 18912
## - House_Prices$Fireplaces    1  2.77e+10 1.20e+12 18927
## - House_Prices$YearRemodAdd  1  3.24e+10 1.20e+12 18930
## - House_Prices$BedroomAbvGr  1  3.57e+10 1.20e+12 18933
## - House_Prices$LotArea       1  5.64e+10 1.22e+12 18948
## - House_Prices$GarageArea    1  8.23e+10 1.25e+12 18967
## - House_Prices$BsmtFinSF1    1  1.33e+11 1.30e+12 19003
## - House_Prices$TotRmsAbvGrd  1  2.03e+11 1.37e+12 19050
## - House_Prices$OverallQual   1  3.51e+11 1.52e+12 19142
# Print the summary of the final model chosen by stepwise AIC selection
summary(step_model)
## 
## Call:
## lm(formula = House_Prices$SalePrice ~ House_Prices$LotArea + 
##     House_Prices$OverallQual + House_Prices$YearBuilt + House_Prices$YearRemodAdd + 
##     House_Prices$BsmtFinSF1 + House_Prices$FullBath + House_Prices$BedroomAbvGr + 
##     House_Prices$TotRmsAbvGrd + House_Prices$Fireplaces + House_Prices$GarageArea)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -284907  -20317   -2692   16283  350668 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -1.15e+06   1.57e+05   -7.34  4.7e-13 ***
## House_Prices$LotArea       7.06e-01   1.08e-01    6.55  9.8e-11 ***
## House_Prices$OverallQual   2.31e+04   1.41e+03   16.34  < 2e-16 ***
## House_Prices$YearBuilt     1.46e+02   5.88e+01    2.49    0.013 *  
## House_Prices$YearRemodAdd  3.88e+02   7.82e+01    4.97  8.2e-07 ***
## House_Prices$BsmtFinSF1    3.05e+01   3.04e+00   10.06  < 2e-16 ***
## House_Prices$FullBath      4.98e+03   3.12e+03    1.59    0.111    
## House_Prices$BedroomAbvGr -1.12e+04   2.15e+03   -5.21  2.3e-07 ***
## House_Prices$TotRmsAbvGrd  1.62e+04   1.30e+03   12.43  < 2e-16 ***
## House_Prices$Fireplaces    9.88e+03   2.15e+03    4.60  4.9e-06 ***
## House_Prices$GarageArea    6.10e+01   7.71e+00    7.91  7.5e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36200 on 889 degrees of freedom
## Multiple R-squared:  0.806,  Adjusted R-squared:  0.804 
## F-statistic:  370 on 10 and 889 DF,  p-value: <2e-16
##Several predictor variables (e.g., LotArea, OverallQual, BsmtFinSF1, etc.) have significant coefficients.
##The Multiple R-squared value suggests that the model explains a substantial proportion of the variance in the dependent variable.
# Fit the ANOVA model on all predictor variables.
# Fix: use the `data =` argument rather than `House_Prices$X` terms in the
# formula, so the fitted model can later score new data via
# predict(model, newdata = ...). `SalePrice ~ .` includes all 12 predictors,
# matching the original model.
model <- aov(SalePrice ~ ., data = House_Prices)
# Perform the ANOVA decomposition of the fitted model
anova_result <- anova(model)
# View the ANOVA table (sequential Type-I sums of squares)
print(anova_result)
## Analysis of Variance Table
## 
## Response: House_Prices$SalePrice
##                            Df   Sum Sq  Mean Sq F value  Pr(>F)    
## House_Prices$LotArea        1 4.22e+11 4.22e+11  320.53 < 2e-16 ***
## House_Prices$OverallQual    1 3.62e+12 3.62e+12 2750.00 < 2e-16 ***
## House_Prices$YearBuilt      1 6.07e+10 6.07e+10   46.15 2.0e-11 ***
## House_Prices$YearRemodAdd   1 3.93e+10 3.93e+10   29.92 5.9e-08 ***
## House_Prices$BsmtFinSF1     1 2.10e+11 2.10e+11  159.64 < 2e-16 ***
## House_Prices$FullBath       1 9.75e+10 9.75e+10   74.14 < 2e-16 ***
## House_Prices$HalfBath       1 4.97e+10 4.97e+10   37.79 1.2e-09 ***
## House_Prices$BedroomAbvGr   1 8.36e+09 8.36e+09    6.35   0.012 *  
## House_Prices$TotRmsAbvGrd   1 2.56e+11 2.56e+11  194.43 < 2e-16 ***
## House_Prices$Fireplaces     1 2.30e+10 2.30e+10   17.49 3.2e-05 ***
## House_Prices$GarageArea     1 8.23e+10 8.23e+10   62.56 7.7e-15 ***
## House_Prices$YrSold         1 2.64e+07 2.64e+07    0.02   0.887    
## Residuals                 887 1.17e+12 1.32e+09                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

OverallQual: the overall quality of the house is highly significant (p-value < 2e-16) and has a substantial effect on the sale price.

Other predictors, such as LotArea, BsmtFinSF1, FullBath, TotRmsAbvGrd, and GarageArea, are also highly significant.

YearBuilt and YearRemodAdd are moderately significant predictors, with p-values of 2.006e-11 and 5.864e-08, respectively.

# Evaluate the model on the held-out test data set with all features
raw_data_test_all <- read.csv("data/Predict.csv")

# Score the test set with the fitted model.
# NOTE(review): `model` above was fit with a formula written as
# `House_Prices$SalePrice ~ House_Prices$LotArea + ...`. With `$`-qualified
# terms, predict() cannot find those variables in `newdata` and falls back
# to the original training vectors, so `newdata` is effectively ignored
# (and the prediction length will not match the test set, triggering
# recycling). Refit the model as lm(SalePrice ~ ., data = House_Prices)
# to make this evaluation meaningful -- this is almost certainly why the
# metrics below are so poor.
predictions_all <- predict(model, newdata = raw_data_test_all)

# Ground-truth sale prices from the test file
actual_values <- raw_data_test_all$SalePrice

# Standard regression evaluation metrics
mse <- mean((predictions_all - actual_values)^2)
rmse <- sqrt(mse)
mae <- mean(abs(predictions_all - actual_values))
# Out-of-sample R^2 = 1 - SS_residual / SS_total; it goes negative when the
# predictions are worse than simply predicting the mean of the actuals.
r_squared <- 1 - (sum((actual_values - predictions_all)^2) / sum((actual_values - mean(actual_values))^2))

cat("Mean Squared Error (MSE):", mse, "\n")
## Mean Squared Error (MSE): 9.622e+09
cat("Root Mean Squared Error (RMSE):", rmse, "\n")
## Root Mean Squared Error (RMSE): 98093
cat("Mean Absolute Error (MAE):", mae, "\n")
## Mean Absolute Error (MAE): 78105
cat("R-squared:", r_squared, "\n")
## R-squared: -24.98
## R-squared: -24.98

The large values for MSE, RMSE, and MAE suggest that the model's predictions have substantial errors, and there is a considerable difference between predicted and actual values. The negative R-squared is concerning and implies that the model is not explaining the variance in sale prices. This could be due to overfitting, model misspecification, or the presence of outliers.

End of the analysis report. Now let us consider construction of the model. Before that, we convert the year built and year remodeled into the age of the property and the age since modification, combine the bathroom counts into a single column (TotalBathrooms), and normalize the data set.

## let construct the Regression model by considering the above assumptions
# A) Build a regression and decision tree model that can accurately predict the price of a house based on several predictors (you select appropriate features) for predicting the data of the house prices we need to consider the Training data and then analyse the data

# Loading the data sets
# Fix: removed the hard-coded, user-specific setwd() call -- it breaks the
# script on any other machine, and the relative path below already resolved
# correctly earlier in this script, so the working directory is already the
# project root.

# Training data set (same file loaded at the top of the script, reloaded
# here without stringsAsFactors so the engineered copy starts clean)
raw_data <- read.csv("data/House_Prices.csv")

# Column notes (the data set has no missing values):
#   LotArea    - lot size in square feet
#   BsmtFinSF1 - finished basement square feet
#   GarageArea - size of garage in square feet
#   SalePrice  - the sale price of the property
# These square-footage columns sit on very different scales and carry
# outliers, so they are z-score normalized for scale consistency and equal
# weight across features.

# Columns to standardize
variables_to_normalize <- c("LotArea", "BsmtFinSF1", "GarageArea")

# Pull out just the columns being transformed
data_to_normalize <- raw_data[, variables_to_normalize]

# scale() with its defaults centers each column on its mean and divides by
# its (n - 1) standard deviation -- the same z-score transform as computing
# colMeans() and apply(..., 2, sd) by hand.
normalized_data <- scale(data_to_normalize)

# Write the standardized columns back into the data frame
raw_data[, variables_to_normalize] <- normalized_data
# Convert the HalfBath count (0, 1, or 2) into full-bath equivalents
# (0, 0.5, 1); any other value becomes NA, matching the original nested
# ifelse() mapping but in a single vectorized expression.
raw_data$ConvertedHalfBath <- ifelse(raw_data$HalfBath %in% 0:2,
                                     raw_data$HalfBath / 2, NA)
# Property age at time of sale, and years since the last remodel
raw_data$AgeBuilt <- raw_data$YrSold - raw_data$YearBuilt
raw_data$AgeRemodAdd <- raw_data$YrSold - raw_data$YearRemodAdd
# Combine full and half baths into one feature (fix: `<-`, not `=`)
raw_data$TotalBathrooms <- raw_data$FullBath + raw_data$ConvertedHalfBath
library(corrplot)

# Keep the engineered feature set plus the response as a purely numeric
# frame for correlation analysis (dplyr:: is namespaced to dodge the
# select() masking introduced by MASS)
House_num <- dplyr::select(
  raw_data,
  LotArea, OverallQual, BsmtFinSF1, BedroomAbvGr, TotRmsAbvGrd,
  Fireplaces, GarageArea, SalePrice, AgeBuilt, AgeRemodAdd, TotalBathrooms
)

# Pairwise Pearson correlations of the engineered feature set
correlation_matrix <- cor(House_num)

# Diverging palette: blue (negative) -> white (zero) -> red (positive)
color_scheme <- colorRampPalette(c("blue", "white", "red"))(20)

# Plot a heat map of the correlation matrix with custom color and title
corrplot(correlation_matrix,
         method = "color",            # Use color to represent correlation values
         col = color_scheme,          # Specify custom color scheme
         title = "House Prices Correlation Heatmap",  # Specify custom title
         tl.cex = 0.8,                # Adjust text size for column and row names
         mar = c(2, 2, 1, 1)          # Adjust margins (bottom, top, left, right)
)

# Print the numeric correlation matrix for the engineered features
print(correlation_matrix)
##                  LotArea OverallQual BsmtFinSF1 BedroomAbvGr TotRmsAbvGrd
## LotArea         1.000000     0.09621    0.20704     0.089578      0.15320
## OverallQual     0.096209     1.00000    0.22736     0.112591      0.45870
## BsmtFinSF1      0.207035     0.22736    1.00000    -0.116004      0.05929
## BedroomAbvGr    0.089578     0.11259   -0.11600     1.000000      0.67145
## TotRmsAbvGrd    0.153195     0.45870    0.05929     0.671454      1.00000
## Fireplaces      0.265592     0.39349    0.29298     0.075402      0.31038
## GarageArea      0.152720     0.59817    0.28696     0.081228      0.36196
## SalePrice       0.264372     0.79621    0.40466     0.164427      0.57736
## AgeBuilt       -0.008562    -0.57104   -0.26448     0.044773     -0.13149
## AgeRemodAdd    -0.013675    -0.55077   -0.13229    -0.005895     -0.24352
## TotalBathrooms  0.110986     0.59879    0.04486     0.396552      0.62311
##                Fireplaces GarageArea SalePrice  AgeBuilt AgeRemodAdd
## LotArea            0.2656    0.15272    0.2644 -0.008562   -0.013675
## OverallQual        0.3935    0.59817    0.7962 -0.571043   -0.550774
## BsmtFinSF1         0.2930    0.28696    0.4047 -0.264480   -0.132290
## BedroomAbvGr       0.0754    0.08123    0.1644  0.044773   -0.005895
## TotRmsAbvGrd       0.3104    0.36196    0.5774 -0.131488   -0.243523
## Fireplaces         1.0000    0.26626    0.4686 -0.167534   -0.126303
## GarageArea         0.2663    1.00000    0.6560 -0.497674   -0.382685
## SalePrice          0.4686    0.65604    1.0000 -0.528367   -0.525312
## AgeBuilt          -0.1675   -0.49767   -0.5284  1.000000    0.570100
## AgeRemodAdd       -0.1263   -0.38268   -0.5253  0.570100    1.000000
## TotalBathrooms     0.2812    0.44306    0.6053 -0.511646   -0.461382
##                TotalBathrooms
## LotArea               0.11099
## OverallQual           0.59879
## BsmtFinSF1            0.04486
## BedroomAbvGr          0.39655
## TotRmsAbvGrd          0.62311
## Fireplaces            0.28121
## GarageArea            0.44306
## SalePrice             0.60533
## AgeBuilt             -0.51165
## AgeRemodAdd          -0.46138
## TotalBathrooms        1.00000
# Diverging palette: blue (negative) -> white (zero) -> red (positive)
color_scheme <- colorRampPalette(c("blue", "white", "red"))(20)

# Upper-triangle heat map of the engineered-feature correlations with the
# coefficients printed in each cell. Fix: the palette computed above was
# never passed to corrplot(), so the plot fell back to the default colors;
# `col = color_scheme` now applies it as the comment intended.
corrplot(correlation_matrix,
         method = "color",            # Use color to represent correlation values
         col = color_scheme,          # Apply the custom palette (was missing)
         type = "upper",              # Display only the upper triangle of the matrix
         tl.col = "black",            # Color of text for column and row names
         tl.srt = 45,                 # Rotation angle of text
         tl.cex = 0.8,                # Text size for column and row names
         tl.offset = 1,               # Offset of text from the heatmap
         addCoef.col = "black",       # Color of correlation coefficients
         number.cex = 0.7,            # Text size for correlation coefficients
         number.digits = 2,           # Number of digits for correlation coefficients
         diag = FALSE,                # Exclude diagonal elements
         outline = TRUE               # Display outline around each cell
)

Strong positive correlations with SalePrice: OverallQual (0.796) has the highest positive correlation with SalePrice. The overall quality of the house, rated on a numeric scale, is a strong predictor of the sale price.

Moderate positive correlations with SalePrice: GarageArea (0.656), the size of the garage; TotalBathrooms (0.605), the total number of bathrooms; TotRmsAbvGrd (0.577), the total rooms above ground; and Fireplaces (0.469), the number of fireplaces, each show a moderate positive correlation with SalePrice.

Negative correlations with SalePrice: AgeBuilt (-0.528), the age of the house (years since it was built), and AgeRemodAdd (-0.525), the years since the last remodel, each have a moderate negative correlation with SalePrice.

# Load the 'car' package for the vif() helper
library(car)

# Fit the candidate regression solely to measure multicollinearity among
# its predictors via variance inflation factors
vif_model <- lm(
  SalePrice ~ OverallQual + BsmtFinSF1 + GarageArea + BedroomAbvGr +
    TotRmsAbvGrd + Fireplaces + AgeBuilt + AgeRemodAdd + TotalBathrooms,
  data = House_num
)
vif_values <- vif(vif_model)
print(vif_values)
##    OverallQual     BsmtFinSF1     GarageArea   BedroomAbvGr   TotRmsAbvGrd 
##          2.606          1.249          1.761          2.121          3.171 
##     Fireplaces       AgeBuilt    AgeRemodAdd TotalBathrooms 
##          1.327          2.254          1.732          2.603

Variance inflation factors (VIF) for the predictor variables in a linear regression model measure how much the variance of an estimated regression coefficient increases when the predictors are correlated. Here, all VIF values are less than 5, which is considered acceptable and indicates a low to moderate level of multicollinearity.

# Visual multicollinearity check: scatterplot matrix of the candidate predictors.
# Assuming 'House_num' is the data frame.
# Specify the columns to include in the pairs plot.
columns_to_plot <- c("OverallQual", "BsmtFinSF1", "GarageArea", "BedroomAbvGr", "TotRmsAbvGrd",  "TotalBathrooms","Fireplaces", "AgeBuilt", "AgeRemodAdd")

# Draw the scatterplot matrix; near-linear panels indicate correlated predictors.
pairs(House_num[, columns_to_plot])

All variables have VIF values below 5, which is generally considered acceptable, so there is no severe multicollinearity in the data. However, the pairs plot shows a clear linear relationship between the number of bedrooms above ground and the total rooms above ground, so to reduce this collinearity we drop the bedrooms-above-ground variable for better prediction on the test data.

# Fit the multiple linear regression model for SalePrice, excluding
# BedroomAbvGr (collinear with TotRmsAbvGrd per the pairs plot above).
reg_model <- lm(SalePrice ~  OverallQual+BsmtFinSF1+GarageArea+TotRmsAbvGrd+TotalBathrooms+Fireplaces+AgeBuilt+AgeRemodAdd, data = House_num)

# Coefficient estimates, per-predictor t-tests, and overall fit statistics.
summary(reg_model)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + BsmtFinSF1 + GarageArea + 
##     TotRmsAbvGrd + TotalBathrooms + Fireplaces + AgeBuilt + AgeRemodAdd, 
##     data = House_num)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -263084  -22065   -2886   16510  344730 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -42491.6     9930.6   -4.28  2.1e-05 ***
## OverallQual     23592.9     1449.6   16.28  < 2e-16 ***
## BsmtFinSF1      15936.3     1393.3   11.44  < 2e-16 ***
## GarageArea      14355.2     1657.5    8.66  < 2e-16 ***
## TotRmsAbvGrd    11747.6     1097.5   10.70  < 2e-16 ***
## TotalBathrooms   5293.0     3149.4    1.68    0.093 .  
## Fireplaces      13219.6     2180.6    6.06  2.0e-09 ***
## AgeBuilt          -90.2       62.6   -1.44    0.150    
## AgeRemodAdd      -427.5       80.6   -5.31  1.4e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37600 on 891 degrees of freedom
## Multiple R-squared:  0.792,  Adjusted R-squared:  0.79 
## F-statistic:  423 on 8 and 891 DF,  p-value: <2e-16
# Sequential (Type I) ANOVA: each row tests the variance explained by a
# predictor after accounting for the predictors listed above it.
anova(reg_model)
## Analysis of Variance Table
## 
## Response: SalePrice
##                 Df   Sum Sq  Mean Sq F value  Pr(>F)    
## OverallQual      1 3.82e+12 3.82e+12  2709.2 < 2e-16 ***
## BsmtFinSF1       1 3.18e+11 3.18e+11   225.4 < 2e-16 ***
## GarageArea       1 2.03e+11 2.03e+11   143.7 < 2e-16 ***
## TotRmsAbvGrd     1 3.14e+11 3.14e+11   222.6 < 2e-16 ***
## TotalBathrooms   1 2.18e+10 2.18e+10    15.5 9.1e-05 ***
## Fireplaces       1 3.90e+10 3.90e+10    27.7 1.8e-07 ***
## AgeBuilt         1 1.46e+10 1.46e+10    10.3  0.0014 ** 
## AgeRemodAdd      1 3.97e+10 3.97e+10    28.2 1.4e-07 ***
## Residuals      891 1.26e+12 1.41e+09                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The coefficients for each variable have associated p-values (Pr(>|t|)). These p-values indicate whether each predictor variable is statistically significant in predicting the response variable. In this case, most predictor variables have very small p-values (< 0.05), suggesting they are statistically significant; the exceptions are TotalBathrooms (p = 0.093) and AgeBuilt (p = 0.150), which are not significant at the 5% level. The F-statistic tests the overall significance of the model, and its associated p-value (Pr(>F)) is extremely small (less than 2e-16), indicating that the model as a whole is statistically significant.

Multiple R-squared (0.792) represents the proportion of variance in the response variable (Sale Price) that is explained by the predictor variables. Adjusted R-squared (0.790) adjusts this value for the number of predictor variables.

The model seems to have a good fit (high R-squared value), and the individual predictor variables appear to be statistically significant in predicting Sale Price.

Post Analysis of the model

# Residual diagnostics for the fitted regression model
residuals <- residuals(reg_model)

# Residuals vs. fitted values: a random scatter around zero supports
# the linearity and constant-variance assumptions.
plot(reg_model$fitted.values, residuals,
     main = "Residuals vs Fitted",
     xlab = "Fitted values",
     ylab = "Residuals")
abline(h = 0, col = "red", lty = 2)

# Shape of the residual distribution
hist(residuals, main = "Histogram of Residuals", xlab = "Residuals")

# Normal Q-Q plot: points close to the line indicate approximately
# normally distributed residuals.
qqnorm(residuals, col = "red")
qqline(residuals, col = "red")

The plot of residuals vs. fitted values does not exhibit any clear patterns, and the residuals appear to be randomly scattered around the horizontal axis. This suggests that the model is appropriately capturing the relationship between the predictors and the response variable.. The Q-Q plot of standardized residuals shows that the points closely follow the diagonal line. This suggests that the residuals are approximately normally distributed, which is a positive indication for the normality assumption. There are no clear trends, U-shapes, or other systematic patterns in the residuals. This further supports the idea that the model is capturing the underlying patterns in the data..

Testing the data against the predict data and performing similar operations on the data

# Load the hold-out (test) data and apply the same feature engineering
# that was applied to the training data.
raw_data_test <- read.csv("data/Predict.csv")

# Specify the variables to normalize
variables_to_normalize_test <- c("LotArea", "BsmtFinSF1", "GarageArea")

# Extract the selected variables
data_to_normalize_test <- raw_data_test[, variables_to_normalize_test]

# Calculate mean and standard deviation for normalization.
# NOTE(review): these statistics are computed from the TEST set itself; for a
# scale strictly comparable to the training features, the training-set
# means/sds should be reused here -- confirm.
means <- colMeans(data_to_normalize_test)
std_devs <- apply(data_to_normalize_test, 2, sd)

# Z-score normalization
normalized_data <- scale(data_to_normalize_test, center = means, scale = std_devs)

# Add the normalized variables back to the original data
raw_data_test[, variables_to_normalize_test] <- normalized_data

# Convert half baths to full-bath equivalents (each half bath counts as 0.5).
# Simplified from a nested ifelse() that mapped only 0/1/2 and returned NA for
# larger counts; multiplication gives identical results for 0-2 and also
# handles any larger counts instead of dropping them.
raw_data_test$ConvertedHalfBath <- raw_data_test$HalfBath * 0.5

# Calculate property ages relative to the year sold
raw_data_test$AgeBuilt <- raw_data_test$YrSold - raw_data_test$YearBuilt
raw_data_test$AgeRemodAdd <- raw_data_test$YrSold - raw_data_test$YearRemodAdd
raw_data_test$TotalBathrooms <- raw_data_test$FullBath + raw_data_test$ConvertedHalfBath

# Score the regression model on the test data
predictions <- predict(reg_model, newdata = raw_data_test)

# Actual SalePrice values from the test data
actual_values <- raw_data_test$SalePrice

# Calculate evaluation metrics: MSE, RMSE, MAE, and out-of-sample R-squared
mse <- mean((predictions - actual_values)^2)
rmse <- sqrt(mse)
mae <- mean(abs(predictions - actual_values))
r_squared <- 1 - (sum((actual_values - predictions)^2) / sum((actual_values - mean(actual_values))^2))

cat("Mean Squared Error (MSE):", mse, "\n")
## Mean Squared Error (MSE): 842614992
cat("Root Mean Squared Error (RMSE):", rmse, "\n")
## Root Mean Squared Error (RMSE): 29028
cat("Mean Absolute Error (MAE):", mae, "\n")
## Mean Absolute Error (MAE): 22254
cat("R-squared:", r_squared, "\n")
## R-squared: 0.7725
# Visualize the predictions vs. actual values
plot(predictions, actual_values, main="Predicted vs Actual", xlab="Predicted", ylab="Actual")
abline(a = 0, b = 1, col = "red", lty = 2)  # diagonal reference: perfect prediction

A comprehensive evaluation of our linear regression model for predicting Sales Prices using both quantitative metrics and visual examination. The model was trained on a dataset of 900 observations, and its performance was assessed on a separate test dataset of 90 observations..

Evaluation Metrics: Mean Squared Error (MSE): The model achieved a Mean Squared Error of 8.4e+08 signifying the average squared difference between predicted and actual Sale Prices. Lower MSE values are indicative of better predictive accuracy..

Root Mean Squared Error (RMSE): With an RMSE of 29,028, our model’s predictions, on average, deviate by approximately $29,028 from the actual Sale Prices. A lower RMSE suggests improved accuracy..

Mean Absolute Error (MAE): The Mean Absolute Error is 22,254, reflecting the average absolute difference between predicted and actual Sale Prices. This metric is useful for understanding the average magnitude of prediction errors..

R-squared: The R-squared value of 0.7725 indicates that our model explains approximately 77.25% of the variance in Sale Prices. A higher R-squared suggests a better fit to the data..

Predicted vs. Actual Values Plot: A visual inspection of the predicted vs. actual values plot further supports the model’s effectiveness. The plot exhibits a clear linear relationship, indicating that the model’s predictions align closely with the actual Sale Prices. The consistency in the alignment across the range of observations suggests that our linear regression model is capturing the underlying patterns in the data..

                                Decision Tree Model
                                                                
# Decision tree model without pruning
library(rpart)
# Build the regression tree (method = 'anova' selects regression, not classification)
tree_model <- rpart(SalePrice ~  LotArea+OverallQual+BsmtFinSF1+TotRmsAbvGrd+Fireplaces+GarageArea+AgeRemodAdd, data = House_num, method = 'anova' )

# Display the complexity parameter (CP) table: 'rel error' is the training
# error and 'xerror' the cross-validation error, both relative to the root node.
printcp(tree_model)
## 
## Regression tree:
## rpart(formula = SalePrice ~ LotArea + OverallQual + BsmtFinSF1 + 
##     TotRmsAbvGrd + Fireplaces + GarageArea + AgeRemodAdd, data = House_num, 
##     method = "anova")
## 
## Variables actually used in tree construction:
## [1] BsmtFinSF1   GarageArea   OverallQual  TotRmsAbvGrd
## 
## Root node error: 6e+12/900 = 6.7e+09
## 
## n= 900 
## 
##      CP nsplit rel error xerror  xstd
## 1 0.478      0      1.00   1.00 0.090
## 2 0.116      1      0.52   0.52 0.047
## 3 0.058      2      0.41   0.41 0.044
## 4 0.028      3      0.35   0.36 0.036
## 5 0.020      4      0.32   0.35 0.038
## 6 0.018      5      0.30   0.35 0.038
## 7 0.014      6      0.28   0.34 0.035
## 8 0.011      7      0.27   0.32 0.031
## 9 0.010      8      0.26   0.31 0.031
# Plot cross-validation error against tree size/CP to guide pruning
plotcp(tree_model)

As we go down the table, CP decreases and the tree becomes more complex. Both the relative (training) error and the cross-validation error (xerror) decrease as splits are added, but the improvements flatten out: beyond about nsplit = 7 the training error keeps falling while the cross-validation error barely improves, which indicates the point at which further growth begins to overfit the data.

# Plot the (unpruned) decision tree model
library(rattle)
fancyRpartPlot(tree_model)

# Build the decision tree model again with growth restricted: a node must
# contain at least 60 observations before a split is attempted (minsplit = 60),
# and a split must improve fit by at least cp = 0.01 to be kept.
tree_model_pruned <- rpart(
  SalePrice ~ LotArea + OverallQual + BsmtFinSF1 + TotRmsAbvGrd + Fireplaces + GarageArea + AgeRemodAdd, 
  data = House_num, 
  method = 'anova',
  control = rpart.control(minsplit = 60, cp = 0.01) 
)

# Display the complexity parameter table for the pruned tree
printcp(tree_model_pruned)
## 
## Regression tree:
## rpart(formula = SalePrice ~ LotArea + OverallQual + BsmtFinSF1 + 
##     TotRmsAbvGrd + Fireplaces + GarageArea + AgeRemodAdd, data = House_num, 
##     method = "anova", control = rpart.control(minsplit = 60, 
##         cp = 0.01))
## 
## Variables actually used in tree construction:
## [1] BsmtFinSF1  GarageArea  OverallQual
## 
## Root node error: 6e+12/900 = 6.7e+09
## 
## n= 900 
## 
##      CP nsplit rel error xerror  xstd
## 1 0.478      0      1.00   1.00 0.090
## 2 0.116      1      0.52   0.53 0.047
## 3 0.058      2      0.41   0.41 0.044
## 4 0.028      3      0.35   0.37 0.034
## 5 0.018      4      0.32   0.35 0.034
## 6 0.014      5      0.30   0.35 0.034
## 7 0.011      6      0.29   0.34 0.033
## 8 0.010      7      0.28   0.33 0.033
# Cross-validation error vs. tree size for the restricted tree
plotcp(tree_model_pruned)

# Plot the pruned tree
library(rattle)
fancyRpartPlot(tree_model_pruned)

Analysis before and after pruning the data

# Compare test-set performance of the unpruned and pruned trees.
# "Accuracy" here is a pseudo R-squared: 1 - MSE / var(actual). Note that
# var() uses an n-1 denominator while the MSE divides by n, so this differs
# slightly from the conventional out-of-sample R-squared.
# Before Pruning
predictions_before_pruning_test <- predict(tree_model, newdata = raw_data_test)
mse_before_pruning_test <- mean((predictions_before_pruning_test - raw_data_test$SalePrice)^2)
accuracy_before_pruning_test <- 1 - mse_before_pruning_test/var(raw_data_test$SalePrice)

# After Pruning
predictions_after_pruning_test <- predict(tree_model_pruned, newdata = raw_data_test)
mse_after_pruning_test <- mean((predictions_after_pruning_test - raw_data_test$SalePrice)^2)
accuracy_after_pruning_test <- 1 - mse_after_pruning_test/var(raw_data_test$SalePrice)

# Display Results
cat("Accuracy Before Pruning (Test Data):", round(accuracy_before_pruning_test * 100, 2), "%\n")
## Accuracy Before Pruning (Test Data): 65.65 %
cat("Accuracy After Pruning (Test Data):", round(accuracy_after_pruning_test * 100, 2), "%\n")
## Accuracy After Pruning (Test Data): 62.59 %

Before pruning, your model achieved an accuracy of 65.65% on the test data. This is the performance of the model without any pruning, meaning the tree was allowed to grow without restrictions..

After pruning, the model achieved an accuracy of 62.59% on the test data. Pruning involves removing branches from the tree to prevent overfitting. In this case, it seems that pruning led to a decrease in accuracy. While pruning can help prevent overfitting on the training data, it may result in a slightly less accurate model on the test data..

# Evaluate the unpruned tree on the test data: MAE, R-squared, adjusted R-squared.
# Make predictions on the test data ('vector' returns predicted SalePrice values)
predictions <- predict(tree_model, newdata = raw_data_test, type = "vector")

# Check the structure of predictions
str(predictions)
##  Named num [1:90] 125471 125471 202039 202039 125471 ...
##  - attr(*, "names")= chr [1:90] "1" "2" "3" "4" ...
# Strip the names attribute so the result is a plain numeric vector
predicted_values <- as.numeric(predictions)

# Mean Absolute Error: average absolute deviation from the actual SalePrice
mae <- mean(abs(predicted_values - raw_data_test$SalePrice))

# Print the evaluation metric
cat("Mean Absolute Error (MAE):", mae, "\n")
## Mean Absolute Error (MAE): 27362
# Calculate the R-squared value manually: 1 - SS_residual / SS_total
actual_values <- raw_data_test$SalePrice
residuals <- actual_values - predictions
ss_total <- sum((actual_values - mean(actual_values))^2)
ss_residual <- sum(residuals^2)
r_squared <- 1 - (ss_residual / ss_total)

# Print the R-squared value
print(paste("R-squared value:", round(r_squared, 4)))
## [1] "R-squared value: 0.6527"
# Adjust R-squared for the number of predictors and observations.
# NOTE(review): variable.importance can include variables that never appear
# in an actual split (e.g. surrogates), so its length may overstate the
# effective number of predictors -- confirm this is the intended count.
num_predictors <- length(tree_model$variable.importance)  # Number of predictors in the model
num_obs <- nrow(raw_data_test)  # Number of observations

adjusted_r_squared <- 1 - (1 - r_squared) * ((num_obs - 1) / (num_obs - num_predictors - 1))

# Print the adjusted R-squared value
print(paste("Adjusted R-squared value:", round(adjusted_r_squared, 4)))
## [1] "Adjusted R-squared value: 0.623"
## [1] "Adjusted R-squared value: 0.623"

Mean Absolute Error (MAE):

The Tree Model has a higher MAE (27362) compared to the Linear Model (22254).. A lower MAE indicates better model performance, so the Linear Model performs better in terms of MAE..

The Linear Model outperforms the Tree Model in terms of Mean Absolute Error (MAE).. The Linear Model has a relatively high R-squared value, indicating a good fit to the data.. Consider the specific requirements of your task and the interpretability of each model when choosing the best model for your scenario.

Classification Model

library(dplyr)
library(ROCR)
# Reload the raw House_Prices data for the classification task
House_Prices <- read.csv("data/House_Prices.csv", header = TRUE, sep = ",", stringsAsFactors = TRUE)

# Binary target: 1 = high overall quality (OverallQual >= 7), 0 = otherwise.
# A single ifelse() is exactly equivalent to the original nested form: the
# inner ifelse(OverallQual >= 7, 1, NA) branch was unreachable, and an NA in
# OverallQual still propagates to an NA response here.
House_Prices$ConvertedOverallQual <- ifelse(House_Prices$OverallQual < 7, 0, 1)

# Converting 'ConvertedOverallQual' to a factor
House_Prices$ConvertedOverallQual <- as.factor(House_Prices$ConvertedOverallQual)

# Drop rows with an NA response (possible only when OverallQual itself is NA)
House_Prices <- House_Prices[!is.na(House_Prices$ConvertedOverallQual), ]
# Remove the raw quality score so it cannot leak into the classifier
House_Prices <- House_Prices %>% dplyr::select(-OverallQual)
# Logistic regression on all remaining columns
class_model <- glm(ConvertedOverallQual ~. , data = House_Prices, family = "binomial")
summary(class_model)
## 
## Call:
## glm(formula = ConvertedOverallQual ~ ., family = "binomial", 
##     data = House_Prices)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.311  -0.347  -0.138   0.191   3.377  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   8.65e+01   1.81e+02    0.48  0.63222    
## LotArea      -3.36e-05   9.23e-06   -3.64  0.00027 ***
## YearBuilt     1.07e-02   6.19e-03    1.72  0.08466 .  
## YearRemodAdd  1.77e-02   9.26e-03    1.91  0.05556 .  
## BsmtFinSF1   -1.91e-03   3.45e-04   -5.54  3.1e-08 ***
## FullBath      3.76e-01   3.31e-01    1.13  0.25680    
## HalfBath     -1.26e-01   2.59e-01   -0.49  0.62672    
## BedroomAbvGr -6.62e-01   2.56e-01   -2.58  0.00979 ** 
## TotRmsAbvGrd  2.11e-01   1.46e-01    1.45  0.14795    
## Fireplaces    1.71e-01   2.08e-01    0.82  0.41145    
## GarageArea    1.96e-03   1.03e-03    1.90  0.05679 .  
## YrSold       -7.53e-02   9.04e-02   -0.83  0.40507    
## SalePrice     4.30e-05   5.10e-06    8.43  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1195.32  on 899  degrees of freedom
## Residual deviance:  471.83  on 887  degrees of freedom
## AIC: 497.8
## 
## Number of Fisher Scoring iterations: 7
library(readxl)
BA_Predict <- read_excel("data/Predict.xlsx")
# Binary target on the hold-out data, built exactly as for the training data:
# 1 = high overall quality (OverallQual >= 7), 0 = otherwise. As in the
# training chunk, the single ifelse() is equivalent to the nested version
# (NA in OverallQual still propagates).
BA_Predict$ConvertedOverallQual <- ifelse(BA_Predict$OverallQual < 7, 0, 1)

# Converting 'ConvertedOverallQual' to a factor
BA_Predict$ConvertedOverallQual <- as.factor(BA_Predict$ConvertedOverallQual)

# Removing rows with NA in the response variable
BA_Predict <- BA_Predict[!is.na(BA_Predict$ConvertedOverallQual), ]
BA_Predict <- BA_Predict %>% dplyr::select(-OverallQual)
library(caret)
# Predicted probability of class "1", then thresholded at 0.5 into hard labels
predict_reg <- predict(class_model, newdata = BA_Predict, type = "response")
predict_reg <- ifelse(predict_reg > 0.5, 1, 0)

# Align factor levels with the training response so caret can compare them
predict_reg <- factor(predict_reg, levels = levels(House_Prices$ConvertedOverallQual))
# Inspect the predicted labels and their distribution
predict_reg
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
##  0  0  1  1  0  0  1  1  0  1  0  0  0  0  1  0  0  0  1  0  0  0  1  1  1  0 
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 
##  1  0  1  1  1  0  1  1  1  0  0  1  1  1  0  1  0  0  0  0  0  1  1  0  0  0 
## 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 
##  0  0  0  0  0  0  0  0  0  1  0  1  1  1  0  0  0  0  0  1  0  1  0  0  0  1 
## 79 80 81 82 83 84 85 86 87 88 89 90 
##  0  0  0  1  1  1  0  0  0  1  1  1 
## Levels: 0 1
# Sanity checks: actual and predicted factors must share the same levels
levels(BA_Predict$ConvertedOverallQual)
## [1] "0" "1"
levels(predict_reg)
## [1] "0" "1"
str(predict_reg)
##  Factor w/ 2 levels "0","1": 1 1 2 2 1 1 2 2 1 2 ...
##  - attr(*, "names")= chr [1:90] "1" "2" "3" "4" ...
# Distribution of predicted labels
table(predict_reg)
## predict_reg
##  0  1 
## 54 36
# Cross-tabulate predictions against the actual classes
table(predict_reg,BA_Predict$ConvertedOverallQual)
##            
## predict_reg  0  1
##           0 47  7
##           1  8 28
# Confusion matrix and derived statistics from caret.
# NOTE(review): caret::confusionMatrix(data, reference) expects the PREDICTED
# classes as 'data' and the true classes as 'reference'; here the actual
# values are passed first and the predictions second, so the
# "Prediction"/"Reference" labels (and sensitivity vs. specificity) are
# swapped relative to convention. Overall accuracy is unaffected, but
# consider confusionMatrix(predict_reg, BA_Predict$ConvertedOverallQual).
X = confusionMatrix(BA_Predict$ConvertedOverallQual,predict_reg)
X
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 47  8
##          1  7 28
##                                        
##                Accuracy : 0.833        
##                  95% CI : (0.74, 0.904)
##     No Information Rate : 0.6          
##     P-Value [Acc > NIR] : 1.59e-06     
##                                        
##                   Kappa : 0.651        
##                                        
##  Mcnemar's Test P-Value : 1            
##                                        
##             Sensitivity : 0.870        
##             Specificity : 0.778        
##          Pos Pred Value : 0.855        
##          Neg Pred Value : 0.800        
##              Prevalence : 0.600        
##          Detection Rate : 0.522        
##    Detection Prevalence : 0.611        
##       Balanced Accuracy : 0.824        
##                                        
##        'Positive' Class : 0            
## 

True Positive (TP): 47 (Actual class: 0, Predicted class: 0). False Positive (FP): 8 (Actual class: 0, Predicted class: 1). False Negative (FN): 7 (Actual class: 1, Predicted class: 0). True Negative (TN): 28 (Actual class: 1, Predicted class: 1).

# Overall accuracy extracted from the confusion-matrix object
accuracy <- X$overall["Accuracy"]

# Positive predictive value (precision) for the positive class ("0" here)
precision <- X$byClass["Pos Pred Value"]

accuracy
## Accuracy 
##   0.8333
precision
## Pos Pred Value 
##         0.8545

Accuracy: 83.33% Accuracy is the proportion of correctly classified instances out of the total number of instances. In this case, the model has an overall accuracy of 83.33%, meaning it correctly predicted the class for approximately 83.33% of the observations..

Positive Predictive Value (Precision): 85.45% Precision, also known as the Positive Predictive Value, measures the proportion of true positive predictions among all positive predictions made by the model. In this case, the positive predictive value is 85.45%. This indicates that when the model predicts the positive class, it is correct about 85.45% of the time..

ROC Curve for the Metrics

# Score the classifier on the hold-out data as probabilities of class "1"
predict_prob <- predict(class_model, newdata = BA_Predict, type = "response")
# Hard 0/1 labels at the 0.5 cutoff (kept for any downstream use)
predict_reg <- ifelse(predict_prob > 0.5, 1, 0)

# ROC analysis must be fed the continuous scores, not thresholded labels:
# passing hard 0/1 predictions collapses the curve to a single operating
# point, so prediction() receives the raw probabilities here.
pred <- prediction(predict_prob, BA_Predict$ConvertedOverallQual)
roc.perf = performance(pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, main = "ROC Curve", col = "blue")
abline(a = 0, b = 1, col = "red")  # random-classifier reference line

ROC curve visually represents the trade-off between true positive rate and false positive rate at different probability thresholds. The reference line (diagonal line) represents a random classifier, and the goal is for the ROC curve to be as far away from this line as possible (toward the upper-left corner).. ROC curves for accuracy, sensitivity, and precision, respectively.

# Area under the ROC curve (AUC): 0.5 = random guessing, 1.0 = perfect.
# NOTE(review): these curves are only fully informative when 'pred' was built
# from continuous probability scores; if it was built from hard 0/1 labels,
# the ROC reduces to a single operating point -- verify upstream.
auc.perf = performance(pred, measure = "auc")
auc.perf@y.values
## [[1]]
## [1] 0.8273
# Accuracy as a function of the probability cutoff
acc.perf = performance(pred, measure = "acc")
plot(acc.perf)

# Recall (sensitivity) as a function of the cutoff
rec.perf = performance(pred, measure = "rec")
plot(rec.perf)

# Precision as a function of the cutoff
prec.perf = performance(pred, measure = "prec")
plot(prec.perf)

Subsequent blocks focus on other important metrics such as accuracy, recall (sensitivity), and precision. These metrics provide insights into different aspects of the model’s performance. These graphs curves can help us choose an appropriate decision threshold for your classification model